## Loading Data ----
# NOTE(review): absolute user-specific paths; prefer relative paths so the
# script runs on other machines.
spam_train <- read.csv("/Users/kevinnguyen/Downloads/spam-train.txt", header = FALSE)
spam_test <- read.csv("/Users/kevinnguyen/Downloads/spam-test.txt", header = FALSE)
# Standardize the 57 predictors; V58 (the class label) is kept unscaled.
# NOTE(review): the test set is scaled with its own means/sds rather than the
# training set's — confirm this is intended.
normalized_train <- data.frame(scale(spam_train[1:57]), spam_train[58])
normalized_test <- data.frame(scale(spam_test[1:57]), spam_test[58])
# log(x + 1) transform to reduce the right skew of the frequency features.
log_train <- data.frame(log(spam_train[1:57] + 1), spam_train[58])
log_test <- data.frame(log(spam_test[1:57] + 1), spam_test[58])
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Binarize each predictor: 1 if the value exceeds ~0, else 0.
# discretizeDF with labels = FALSE yields codes 1/2, so subtract 1 below.
ds_train <- data.frame(
  discretizeDF(spam_train[1:57],
               default = list(method = "fixed",
                              breaks = c(-Inf, 0.00001, Inf),
                              labels = FALSE)),
  spam_train[58]
)
ds_train[1:57] <- ds_train[1:57] - 1
ds_test <- data.frame(
  discretizeDF(spam_test[1:57],
               default = list(method = "fixed",
                              breaks = c(-Inf, 0.00001, Inf),
                              labels = FALSE)),
  spam_test[58]
)
ds_test[1:57] <- ds_test[1:57] - 1
## Normalized Data
# Scatterplot of each predictor against the class label, 9 panels per page.
for (i in 1:57) {
  # BUG FIX: "i-1 %% 9 == 0" parses as "(i - (1 %% 9)) == 0" because %%
  # binds tighter than "-", so the grid was only (re)set at i == 1.
  if ((i - 1) %% 9 == 0) {
    par(mfrow = c(3, 3))
  }
  plot(normalized_train[, i], normalized_train[, 58],
       main = names(normalized_train)[i])
}
# Boxplot of each predictor split by class, 9 panels per page.
for (i in 1:57) {
  if ((i - 1) %% 9 == 0) {
    par(mfrow = c(3, 3))
  }
  boxplot(normalized_train[, i] ~ normalized_train[, 58],
          main = names(normalized_train)[i])
}
## Log Data
# Scatterplot of each log-transformed predictor against the class label.
for (i in 1:57) {
  # BUG FIX: parenthesize (i - 1) %% 9 — %% binds tighter than "-", so the
  # original condition was effectively "i == 1".
  if ((i - 1) %% 9 == 0) {
    par(mfrow = c(3, 3))
  }
  plot(log_train[, i], log_train[, 58], main = names(log_train)[i])
}
# Boxplot of each log-transformed predictor split by class.
for (i in 1:57) {
  if ((i - 1) %% 9 == 0) {
    par(mfrow = c(3, 3))
  }
  boxplot(log_train[, i] ~ log_train[, 58], main = names(log_train)[i])
}
# Discretized Data
# Side-by-side frequency bars of each binary indicator vs class, 4 per page.
for (i in 1:57) {
  counts <- table(ds_train[, i], ds_train[, 58])
  # BUG FIX: parenthesize (i - 1) %% 4 — %% binds tighter than "-", so the
  # original condition was effectively "i == 1".
  if ((i - 1) %% 4 == 0) {
    par(mfrow = c(1, 4))
  }
  barplot(counts, main = names(ds_train[i]), beside = TRUE,
          legend = rownames(counts), xlab = names(ds_train[i]),
          args.legend = list(title = "V58"), ylab = "Frequency",
          col = c("#E7B800", "#00AFBB"))
}
## Normalized Data
# Logistic regression of the spam indicator V58 on all 57 predictors.
lr_norm_train <- glm(V58 ~ ., data = normalized_train, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# In-sample probabilities, thresholded at 0.5 to get class predictions.
lr_norm_train_prob <- predict(lr_norm_train, normalized_train, type = "response")
lr_norm_train_predict <- ifelse(lr_norm_train_prob > 0.5, 1, 0)
table(lr_norm_train_predict, spam_train$V58)
##
## lr_norm_train_predict 0 1
## 0 1762 133
## 1 87 1085
# classification error
(lr_norm_train_error <- mean(lr_norm_train_predict != spam_train$V58))
## [1] 0.07173133
# Held-out predictions on the normalized test set.
lr_norm_test_prob <- predict(lr_norm_train, normalized_test, type = "response")
lr_norm_test_predict <- ifelse(lr_norm_test_prob > 0.5, 1, 0)
table(lr_norm_test_predict, spam_test$V58)
##
## lr_norm_test_predict 0 1
## 0 877 70
## 1 39 548
# classification error
(lr_norm_test_error <- mean(lr_norm_test_predict != spam_test$V58))
## [1] 0.07105606
## Log Transformed Data
# Same logistic-regression workflow on the log(x + 1) features.
lr_log_train <- glm(V58 ~ ., data = log_train, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lr_log_train_prob <- predict(lr_log_train, log_train, type = "response")
lr_log_train_predict <- ifelse(lr_log_train_prob > 0.5, 1, 0)
table(lr_log_train_predict, spam_train$V58)
##
## lr_log_train_predict 0 1
## 0 1766 94
## 1 83 1124
# classification error
(lr_log_train_error <- mean(lr_log_train_predict != spam_train$V58))
## [1] 0.05771112
# Test-set performance of the log-data model.
lr_log_test_prob <- predict(lr_log_train, log_test, type = "response")
lr_log_test_predict <- ifelse(lr_log_test_prob > 0.5, 1, 0)
table(lr_log_test_predict, spam_test$V58)
##
## lr_log_test_predict 0 1
## 0 879 50
## 1 37 568
# classification error
(lr_log_test_error <- mean(lr_log_test_predict != spam_test$V58))
## [1] 0.05671447
## Discretized Data
# Logistic regression on the 0/1 indicators; some indicators are collinear,
# hence the rank-deficient-fit warnings from predict().
lr_ds_train <- glm(V58 ~ ., data = ds_train, family = binomial)
lr_ds_train_prob <- predict(lr_ds_train, ds_train, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
lr_ds_train_predict <- ifelse(lr_ds_train_prob > 0.5, 1, 0)
table(lr_ds_train_predict, spam_train$V58)
##
## lr_ds_train_predict 0 1
## 0 1779 105
## 1 70 1113
# classification error
(lr_ds_train_error <- mean(lr_ds_train_predict != spam_train$V58))
## [1] 0.05705902
# Test-set performance of the discretized-data model.
lr_ds_test_prob <- predict(lr_ds_train, ds_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
lr_ds_test_predict <- ifelse(lr_ds_test_prob > 0.5, 1, 0)
table(lr_ds_test_predict, spam_test$V58)
##
## lr_ds_test_predict 0 1
## 0 859 67
## 1 57 551
# classification error
(lr_ds_test_error <- mean(lr_ds_test_predict != spam_test$V58))
## [1] 0.08083442
library(MASS)
# Keep the raw class labels handy for the error computations below.
train_V58 <- spam_train$V58
test_V58 <- spam_test$V58
## Linear Discriminant Analysis: LDA
## Normalized Data
# FIX: spell out the full argument name "grouping" — the original "group ="
# only worked via R's partial argument matching.
LDA_norm_train <- lda(normalized_train[1:57], grouping = train_V58)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:arules':
##
## intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Test-set error.
predict_norm_lda <- LDA_norm_train %>% predict(normalized_test[1:57])
# classification error
(LDA_norm_test_error <- mean(predict_norm_lda$class != test_V58))
## [1] 0.1029987
# Training-set error.
predict_norm_lda <- LDA_norm_train %>% predict(normalized_train[1:57])
# classification error
(LDA_norm_train_error <- mean(predict_norm_lda$class != train_V58))
## [1] 0.1017281
## Log Transformed Data
# FIX: use the full argument name "grouping" (was partial-matched "group").
LDA_log_train <- lda(log_train[1:57], grouping = train_V58)
predict_log_lda <- LDA_log_train %>% predict(log_test[1:57])
# classification error
(LDA_log_test_error <- mean(predict_log_lda$class != test_V58))
## [1] 0.06518905
predict_log_lda <- LDA_log_train %>% predict(log_train[1:57])
# classification error
(LDA_log_train_error <- mean(predict_log_lda$class != train_V58))
## [1] 0.06031953
## Quadratic Discriminant Analysis: QDA
## Normalized Data
# FIX: use the full argument name "grouping" (was partial-matched "group").
QDA_norm_train <- qda(normalized_train[1:57], grouping = train_V58)
predict_norm_qda <- QDA_norm_train %>% predict(normalized_test[1:57])
# classification error
(QDA_norm_test_error <- mean(predict_norm_qda$class != test_V58))
## [1] 0.1747066
predict_norm_qda <- QDA_norm_train %>% predict(normalized_train[1:57])
# classification error
(QDA_norm_train_error <- mean(predict_norm_qda$class != train_V58))
## [1] 0.1786762
## Log Transformed Data
# FIX: use the full argument name "grouping" (was partial-matched "group").
QDA_log_train <- qda(log_train[1:57], grouping = train_V58)
predict_log_qda <- QDA_log_train %>% predict(log_test[1:57])
# classification error
(QDA_log_test_error <- mean(predict_log_qda$class != test_V58))
## [1] 0.1571056
predict_log_qda <- QDA_log_train %>% predict(log_train[1:57])
# classification error
(QDA_log_train_error <- mean(predict_log_qda$class != train_V58))
## [1] 0.1587871
library(e1071)
## Normalized Data
# svm() requires a factor response for C-classification.
normalized_train$V58 <- as.factor(normalized_train$V58)
normalized_test$V58 <- as.factor(normalized_test$V58)
### Linear SVM
# Linear-kernel SVM at cost = 1; data are pre-standardized, so scale = FALSE.
norm_lsvm <- svm(formula = V58 ~ ., data = normalized_train,
                 kernel = "linear", cost = 1, scale = FALSE)
summary(norm_lsvm)
##
## Call:
## svm(formula = V58 ~ ., data = normalized_train, kernel = "linear",
## cost = 1, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 621
##
## ( 315 306 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# 10-fold CV over a cost grid to pick the best linear SVM.
norm_ltune <- tune(svm, V58 ~ ., data = normalized_train, kernel = "linear",
                   ranges = list(cost = c(0.01, 0.1, 1, 10, 100)))
summary(norm_ltune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 100
##
## - best performance: 0.073374
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.08315450 0.01677995
## 2 1e-01 0.07957144 0.01884720
## 3 1e+00 0.07500479 0.01633094
## 4 1e+01 0.07533265 0.01652315
## 5 1e+02 0.07337400 0.01491234
norm_lbestmod <- norm_ltune$best.model
summary(norm_lbestmod)
##
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = normalized_train,
## ranges = list(cost = c(0.01, 0.1, 1, 10, 100)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 100
##
## Number of Support Vectors: 595
##
## ( 310 285 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
norm_lbestmod$cost
## [1] 100
# Training confusion matrix and error for the tuned model.
norm_lypred_train <- predict(norm_lbestmod, normalized_train)
table(predict = norm_lypred_train, truth = normalized_train$V58)
## truth
## predict 0 1
## 0 1771 121
## 1 78 1097
# classification error
(norm_train_lsvm_error <- mean(norm_lypred_train != train_V58))
## [1] 0.06488425
# Test confusion matrix and error.
norm_lypred_test <- predict(norm_lbestmod, normalized_test)
table(predict = norm_lypred_test, truth = normalized_test$V58)
## truth
## predict 0 1
## 0 878 68
## 1 38 550
# classification error
(norm_test_lsvm_error <- mean(norm_lypred_test != test_V58))
## [1] 0.06910039
### Non-Linear SVM
# Radial (Gaussian) kernel SVM at the default-ish cost = 1.
norm_nsvm <- svm(formula = V58 ~ ., data = normalized_train,
                 kernel = "radial", cost = 1, scale = FALSE)
summary(norm_nsvm)
##
## Call:
## svm(formula = V58 ~ ., data = normalized_train, kernel = "radial",
## cost = 1, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 926
##
## ( 492 434 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# 10-fold CV jointly over cost and gamma.
norm_ntune <- tune(svm, V58 ~ ., data = normalized_train, kernel = "radial",
                   ranges = list(cost = c(0.01, 0.1, 1, 10, 100, 1000),
                                 gamma = c(0.1, 0.5, 1)))
summary(norm_ntune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 10 0.1
##
## - best performance: 0.06422261
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-02 0.1 0.39713972 0.02965918
## 2 1e-01 0.1 0.18746673 0.02384839
## 3 1e+00 0.1 0.06846139 0.02081349
## 4 1e+01 0.1 0.06422261 0.02064662
## 5 1e+02 0.1 0.06585233 0.01712869
## 6 1e+03 0.1 0.07074578 0.01336101
## 7 1e-02 0.5 0.39713972 0.02965918
## 8 1e-01 0.5 0.36388516 0.02986606
## 9 1e+00 0.5 0.12324839 0.02473036
## 10 1e+01 0.5 0.11966213 0.02366826
## 11 1e+02 0.5 0.12227119 0.02442247
## 12 1e+03 0.5 0.12325158 0.02715628
## 13 1e-02 1.0 0.39713972 0.02965918
## 14 1e-01 1.0 0.37953099 0.02817976
## 15 1e+00 1.0 0.13597113 0.02877010
## 16 1e+01 1.0 0.13368781 0.02940463
## 17 1e+02 1.0 0.13368781 0.02980286
## 18 1e+03 1.0 0.13466714 0.03154766
norm_nbestmod <- norm_ntune$best.model
summary(norm_nbestmod)
##
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = normalized_train,
## ranges = list(cost = c(0.01, 0.1, 1, 10, 100, 1000), gamma = c(0.1,
## 0.5, 1)), kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
##
## Number of Support Vectors: 1369
##
## ( 746 623 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# Selected hyperparameters.
norm_nbestmod$cost
## [1] 10
norm_nbestmod$gamma
## [1] 0.1
# Training confusion matrix and error.
norm_nypred_train <- predict(norm_nbestmod, normalized_train)
table(predict = norm_nypred_train, truth = normalized_train$V58)
## truth
## predict 0 1
## 0 1848 15
## 1 1 1203
# classification error
(norm_train_nsvm_error <- mean(norm_nypred_train != train_V58))
## [1] 0.005216824
# Test confusion matrix and error.
norm_nypred_test <- predict(norm_nbestmod, normalized_test)
table(predict = norm_nypred_test, truth = normalized_test$V58)
## truth
## predict 0 1
## 0 885 65
## 1 31 553
# classification error
(norm_test_nsvm_error <- mean(norm_nypred_test != test_V58))
## [1] 0.06258149
## Log Data
# svm() requires a factor response for C-classification.
log_train$V58 <- as.factor(log_train$V58)
log_test$V58 <- as.factor(log_test$V58)
### Linear SVM
# Linear-kernel SVM on the log-transformed features.
log_lsvm <- svm(formula = V58 ~ ., data = log_train,
                kernel = "linear", cost = 1, scale = FALSE)
summary(log_lsvm)
##
## Call:
## svm(formula = V58 ~ ., data = log_train, kernel = "linear", cost = 1,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 529
##
## ( 273 256 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# 10-fold CV over a cost grid.
log_ltune <- tune(svm, V58 ~ ., data = log_train, kernel = "linear",
                  ranges = list(cost = c(0.001, 0.01, 0.1, 1, 10)))
summary(log_ltune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.01
##
## - best performance: 0.05934406
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.06912776 0.01523963
## 2 1e-02 0.05934406 0.01433521
## 3 1e-01 0.06064699 0.01734521
## 4 1e+00 0.06097379 0.01840285
## 5 1e+01 0.06162632 0.01824241
log_lbestmod <- log_ltune$best.model
summary(log_lbestmod)
##
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = log_train, ranges = list(cost = c(0.001,
## 0.01, 0.1, 1, 10)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 680
##
## ( 343 337 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
log_lbestmod$cost
## [1] 0.01
# Training confusion matrix and error.
log_lypred_train <- predict(log_lbestmod, log_train)
table(predict = log_lypred_train, truth = log_train$V58)
## truth
## predict 0 1
## 0 1777 109
## 1 72 1109
# classification error
(log_train_lsvm_error <- mean(log_lypred_train != train_V58))
## [1] 0.05901532
# Test confusion matrix and error.
log_lypred_test <- predict(log_lbestmod, log_test)
table(predict = log_lypred_test, truth = log_test$V58)
## truth
## predict 0 1
## 0 880 53
## 1 36 565
# classification error
(log_test_lsvm_error <- mean(log_lypred_test != test_V58))
## [1] 0.05801825
### Non-Linear SVM
# Radial (Gaussian) kernel SVM on the log-transformed features.
log_nsvm <- svm(formula = V58 ~ ., data = log_train,
                kernel = "radial", cost = 1, scale = FALSE)
summary(log_nsvm)
##
## Call:
## svm(formula = V58 ~ ., data = log_train, kernel = "radial", cost = 1,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 877
##
## ( 442 435 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# 10-fold CV jointly over cost and gamma.
log_ntune <- tune(svm, V58 ~ ., data = log_train, kernel = "radial",
                  ranges = list(cost = c(0.1, 1, 10, 100, 1000),
                                gamma = c(0.001, 0.01, 0.1)))
summary(log_ntune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 10 0.01
##
## - best performance: 0.04696089
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-01 0.001 0.10631028 0.01739720
## 2 1e+00 0.001 0.06326244 0.01051033
## 3 1e+01 0.001 0.06065551 0.01611446
## 4 1e+02 0.001 0.05478700 0.01419978
## 5 1e+03 0.001 0.04826489 0.01183385
## 6 1e-01 0.010 0.06456643 0.01379263
## 7 1e+00 0.010 0.05575994 0.01440965
## 8 1e+01 0.010 0.04696089 0.01503510
## 9 1e+02 0.010 0.04696515 0.01625991
## 10 1e+03 0.010 0.04761449 0.01689666
## 11 1e-01 0.100 0.25431969 0.03166235
## 12 1e+00 0.100 0.07141321 0.01620726
## 13 1e+01 0.100 0.06554896 0.01727559
## 14 1e+02 0.100 0.06619829 0.01737204
## 15 1e+03 0.100 0.06749910 0.01428723
log_nbestmod <- log_ntune$best.model
summary(log_nbestmod)
##
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = log_train, ranges = list(cost = c(0.1,
## 1, 10, 100, 1000), gamma = c(0.001, 0.01, 0.1)), kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
##
## Number of Support Vectors: 609
##
## ( 323 286 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# Selected hyperparameters.
log_nbestmod$cost
## [1] 10
log_nbestmod$gamma
## [1] 0.01
# Training confusion matrix and error.
log_nypred_train <- predict(log_nbestmod, log_train)
table(predict = log_nypred_train, truth = log_train$V58)
## truth
## predict 0 1
## 0 1826 45
## 1 23 1173
# classification error
(log_train_nsvm_error <- mean(log_nypred_train != train_V58))
## [1] 0.0221715
# Test confusion matrix and error.
log_nypred_test <- predict(log_nbestmod, log_test)
table(predict = log_nypred_test, truth = log_test$V58)
## truth
## predict 0 1
## 0 892 34
## 1 24 584
# classification error
(log_test_nsvm_error <- mean(log_nypred_test != test_V58))
## [1] 0.03780965
## Discretized Data
# svm() requires a factor response for C-classification.
ds_train$V58 <- as.factor(ds_train$V58)
ds_test$V58 <- as.factor(ds_test$V58)
### Linear SVM
# Linear-kernel SVM on the binary indicator features.
ds_lsvm <- svm(formula = V58 ~ ., data = ds_train,
               kernel = "linear", cost = 1, scale = FALSE)
summary(ds_lsvm)
##
## Call:
## svm(formula = V58 ~ ., data = ds_train, kernel = "linear", cost = 1,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 560
##
## ( 280 280 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# 10-fold CV over a cost grid.
ds_ltune <- tune(svm, V58 ~ ., data = ds_train, kernel = "linear",
                 ranges = list(cost = c(0.01, 0.1, 1, 10, 100)))
summary(ds_ltune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.06652509
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.07695174 0.01134329
## 2 1e-01 0.06945349 0.01435534
## 3 1e+00 0.06652509 0.01247640
## 4 1e+01 0.06913628 0.01534461
## 5 1e+02 0.06946094 0.01407189
ds_lbestmod <- ds_ltune$best.model
summary(ds_lbestmod)
##
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = ds_train, ranges = list(cost = c(0.01,
## 0.1, 1, 10, 100)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 560
##
## ( 280 280 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
ds_lbestmod$cost
## [1] 1
# Training confusion matrix and error.
ds_lypred_train <- predict(ds_lbestmod, ds_train)
table(predict = ds_lypred_train, truth = ds_train$V58)
## truth
## predict 0 1
## 0 1780 116
## 1 69 1102
# classification error
(ds_train_lsvm_error <- mean(ds_lypred_train != train_V58))
## [1] 0.06031953
# Test confusion matrix and error.
ds_lypred_test <- predict(ds_lbestmod, ds_test)
table(predict = ds_lypred_test, truth = ds_test$V58)
## truth
## predict 0 1
## 0 865 63
## 1 51 555
# classification error
(ds_test_lsvm_error <- mean(ds_lypred_test != test_V58))
## [1] 0.07431551
### Non-Linear SVM
# Radial (Gaussian) kernel SVM on the binary indicator features.
ds_nsvm <- svm(formula = V58 ~ ., data = ds_train,
               kernel = "radial", cost = 1, scale = FALSE)
summary(ds_nsvm)
##
## Call:
## svm(formula = V58 ~ ., data = ds_train, kernel = "radial", cost = 1,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 814
##
## ( 411 403 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# 10-fold CV jointly over cost and gamma.
ds_ntune <- tune(svm, V58 ~ ., data = ds_train, kernel = "radial",
                 ranges = list(cost = c(0.1, 1, 10, 100, 1000),
                               gamma = c(0.1, 0.5, 1, 1.5)))
summary(ds_ntune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 100 0.5
##
## - best performance: 0.04466266
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-01 0.1 0.07107258 0.012037349
## 2 1e+00 0.1 0.05835728 0.009856153
## 3 1e+01 0.1 0.04694599 0.011796695
## 4 1e+02 0.1 0.05215665 0.014451322
## 5 1e+03 0.1 0.05248025 0.015197744
## 6 1e-01 0.5 0.12227012 0.027584807
## 7 1e+00 0.5 0.04499053 0.011037153
## 8 1e+01 0.5 0.04498733 0.012897975
## 9 1e+02 0.5 0.04466266 0.011680089
## 10 1e+03 0.5 0.04466266 0.011680089
## 11 1e-01 1.0 0.36585233 0.032682283
## 12 1e+00 1.0 0.10759618 0.021438712
## 13 1e+01 1.0 0.10010006 0.019918645
## 14 1e+02 1.0 0.10010006 0.019918645
## 15 1e+03 1.0 0.10010006 0.019918645
## 16 1e-01 1.5 0.37530604 0.029262410
## 17 1e+00 1.5 0.12324200 0.022104985
## 18 1e+01 1.5 0.11867855 0.020872856
## 19 1e+02 1.5 0.11867855 0.020872856
## 20 1e+03 1.5 0.11867855 0.020872856
ds_nbestmod <- ds_ntune$best.model
summary(ds_nbestmod)
##
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = ds_train, ranges = list(cost = c(0.1,
## 1, 10, 100, 1000), gamma = c(0.1, 0.5, 1, 1.5)), kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 100
##
## Number of Support Vectors: 1475
##
## ( 756 719 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# Selected hyperparameters.
ds_nbestmod$cost
## [1] 100
ds_nbestmod$gamma
## [1] 0.5
# Training confusion matrix and error.
ds_nypred_train <- predict(ds_nbestmod, ds_train)
table(predict = ds_nypred_train, truth = ds_train$V58)
## truth
## predict 0 1
## 0 1842 14
## 1 7 1204
# classification error
(ds_train_nsvm_error <- mean(ds_nypred_train != train_V58))
## [1] 0.006847082
# Test confusion matrix and error.
ds_nypred_test <- predict(ds_nbestmod, ds_test)
table(predict = ds_nypred_test, truth = ds_test$V58)
## truth
## predict 0 1
## 0 889 35
## 1 27 583
# classification error
(ds_test_nsvm_error <- mean(ds_nypred_test != test_V58))
## [1] 0.04041721
Report the classification errors of each method on each preprocessed data set in a single table, and comment on the differences in performance.
## Summary table: 5 methods (rows) x 6 data/split combinations (columns).
error_matrix <- matrix(NA_real_, nrow = 5, ncol = 6)
colnames(error_matrix) <- c("Normal train", "Normal test", "Log Train", "Log Test",
                            "Discretized Train", "Discretized Test")
rownames(error_matrix) <- c("Logistic Regression", "LDA", "QDA",
                            "SVM Linear", "SVM Gaussian")
error_matrix
## Normal train Normal test Log Train Log Test
## Logistic Regression NA NA NA NA
## LDA NA NA NA NA
## QDA NA NA NA NA
## SVM Linear NA NA NA NA
## SVM Gaussian NA NA NA NA
## Discretized Train Discretized Test
## Logistic Regression NA NA
## LDA NA NA
## QDA NA NA
## SVM Linear NA NA
## SVM Gaussian NA NA
# Fill one row per method (vectorized, instead of cell-by-cell scalar
# assignments). LDA/QDA were not fit on the discretized data, so those
# cells keep their initial NA.
error_matrix[1, ] <- c(lr_norm_train_error, lr_norm_test_error,
                       lr_log_train_error, lr_log_test_error,
                       lr_ds_train_error, lr_ds_test_error)
error_matrix[2, 1:4] <- c(LDA_norm_train_error, LDA_norm_test_error,
                          LDA_log_train_error, LDA_log_test_error)
error_matrix[3, 1:4] <- c(QDA_norm_train_error, QDA_norm_test_error,
                          QDA_log_train_error, QDA_log_test_error)
error_matrix[4, ] <- c(norm_train_lsvm_error, norm_test_lsvm_error,
                       log_train_lsvm_error, log_test_lsvm_error,
                       ds_train_lsvm_error, ds_test_lsvm_error)
error_matrix[5, ] <- c(norm_train_nsvm_error, norm_test_nsvm_error,
                       log_train_nsvm_error, log_test_nsvm_error,
                       ds_train_nsvm_error, ds_test_nsvm_error)
error_matrix
## Normal train Normal test Log Train Log Test
## Logistic Regression 0.071731334 0.07105606 0.05771112 0.05671447
## LDA 0.101728073 0.10299870 0.06031953 0.06518905
## QDA 0.178676231 0.17470665 0.15878709 0.15710561
## SVM Linear 0.064884252 0.06910039 0.05901532 0.05801825
## SVM Gaussian 0.005216824 0.06258149 0.02217150 0.03780965
## Discretized Train Discretized Test
## Logistic Regression 0.057059015 0.08083442
## LDA NA NA
## QDA NA NA
## SVM Linear 0.060319530 0.07431551
## SVM Gaussian 0.006847082 0.04041721
The lowest test classification error rate among all models and preprocessing schemes was achieved by the Gaussian (radial-kernel) support vector machine fit to the log(xij + 1)-transformed data, at roughly 3.8%. All of the other test classification errors hovered around 6%.
The lowest training classification error rate was achieved by the Gaussian support vector machine fit to the normalized data, at under 1%. The remaining training classification errors ranged from under 1% up to about 17.9%.
Finally, use either a single method with properly chosen tuning parameter or a combination of several methods to design a classifier with test error rate as small as possible. Describe your recommended method, and report its performance.
# Recommended classifier: the radial-kernel SVM tuned on the log-transformed
# data above (cost = 10, gamma = 0.01), which had the smallest test error.
best_preprocessed_data_train <- log_train
best_preprocessed_data_test <- log_test
best_method_model <- log_nbestmod
summary(best_method_model)
##
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = log_train, ranges = list(cost = c(0.1,
## 1, 10, 100, 1000), gamma = c(0.001, 0.01, 0.1)), kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
##
## Number of Support Vectors: 609
##
## ( 323 286 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# classification error on the held-out test set
log_test_nsvm_error
## [1] 0.03780965
Earlier in the code, we tuned the nonlinear SVM on each of the preprocessed data sets and found that the log-transformed data with cost = 10 and gamma = 0.01 produced the model with the smallest test error rate, approximately 3.8%. The nonlinear SVM is based on the Gaussian (radial) kernel, which yields curved, roughly elliptical decision boundaries between the classes; its advantage over the linear models indicates that the data are not linearly separable.